To do: 1. Use balances instead of the raw BIOM table later.
2. Make sure we are not overfitting (cross-validation? a sketch is added after the first PLS plot below).
In [1]:
import pandas as pd
from biom import load_table
import matplotlib.pyplot as plt
import numpy as np
from skbio import TreeNode
import seaborn as sns
%matplotlib inline
sns.set_style('ticks')
In [2]:
# OTU table (dtype=str keeps counts as strings; coerce to float before modeling), transposed to samples x OTUs
table = pd.read_table('../data/biomtable.txt', sep='\t', dtype=str, index_col='#OTU ID').transpose()
# sample metadata, indexed by #SampleID
metadata = pd.read_table('../data/mapping_cleaned_MrOS.txt', sep='\t', dtype=str, index_col='#SampleID')
In [3]:
table.head()
Out[3]:
In [4]:
from gneiss.util import match
table, metadata = match(table, metadata)
In [5]:
table.head(3)
Out[5]:
In [6]:
metadata.head(3)
Out[6]:
In [7]:
print(table.shape, metadata.shape)
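The composition helpers imported in the next cell (clr, centralize, multiplicative_replacement) are never actually applied to the table in this section. Below is a minimal sketch of how they could be used, assuming the counts should be treated compositionally; this is an illustration, not part of the original analysis.
In [ ]:
# Sketch only (assumption: counts are to be treated as compositions).
from skbio.stats.composition import clr, multiplicative_replacement

counts = table.values.astype(float)         # samples x OTUs; coerce string counts to floats
comp = multiplicative_replacement(counts)   # replace zeros so log-ratios are defined
clr_table = clr(comp)                       # centered log-ratio transform
print(clr_table.shape)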
In [8]:
from sklearn.cross_decomposition import PLSRegression
from skbio.stats.composition import clr, centralize, multiplicative_replacement

# Coerce the string counts to floats once; reused by the cells below.
X = table.values.astype(float)

# Two-component PLS against vitamin D status, plotting the sample scores.
pls = PLSRegression(n_components=2)
pls.fit(X, metadata.VDstatus == 'sufficiency')
midx = metadata.VDstatus == 'sufficiency'
lidx = metadata.VDstatus == 'deficiency'
plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='sufficiency')
plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='deficiency')
plt.xlabel('PLS1')
plt.ylabel('PLS2')
plt.legend()
Out[8]:
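For to-do item 2, here is a minimal cross-validation sketch (an illustration, not the original analysis): PLSRegression's default R^2 score on the 0/1 vitamin D label under 5-fold CV. Scores near or below zero across folds would suggest the two-component fit does not generalize.
In [ ]:
# Cross-validation sketch for the overfitting question in the to-do list.
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import KFold, cross_val_score

X_cv = table.values.astype(float)
y_cv = (metadata.VDstatus == 'sufficiency').astype(float).values
scores = cross_val_score(PLSRegression(n_components=2), X_cv, y_cv,
                         cv=KFold(n_splits=5, shuffle=True, random_state=0))
print(scores, scores.mean())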
In [9]:
# QLCOMP (newly added covariate)
# pls = PLSRegression(n_components=2)
# pls.fit(X, metadata.QLCOMP == '1:GOOD/EXCELLENT')
# midx = metadata.QLCOMP == '1:GOOD/EXCELLENT'
# lidx = metadata.QLCOMP == '0:PR/VPOOR/FAIR'
# plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='1:GOOD/EXCELLENT')
# plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='0:PR/VPOOR/FAIR')
# plt.xlabel('PLS1')
# plt.ylabel('PLS2')
# plt.legend()
In [10]:
# Same PLS score plot, grouped by M1ANTIB.
pls = PLSRegression(n_components=2)
pls.fit(X, metadata.M1ANTIB == '0: No')
midx = metadata.M1ANTIB == '0: No'
lidx = metadata.M1ANTIB == '1: Yes'
plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='0: No')
plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='1: Yes')
plt.xlabel('PLS1')
plt.ylabel('PLS2')
plt.legend()
Out[10]:
In [11]:
# Same PLS score plot, grouped by M1VITMND.
pls = PLSRegression(n_components=2)
pls.fit(X, metadata.M1VITMND == '1: Yes')
midx = metadata.M1VITMND == '0: No'
lidx = metadata.M1VITMND == '1: Yes'
plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='0: No')
plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='1: Yes')
plt.xlabel('PLS1')
plt.ylabel('PLS2')
plt.legend()
Out[11]:
In [12]:
# Same PLS score plot, grouped by M1ADEPR.
pls = PLSRegression(n_components=2)
pls.fit(X, metadata.M1ADEPR == '1: Yes')
midx = metadata.M1ADEPR == '0: No'
lidx = metadata.M1ADEPR == '1: Yes'
plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='0: No')
plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='1: Yes')
plt.xlabel('PLS1')
plt.ylabel('PLS2')
plt.legend()
Out[12]:
In [13]:
# Same PLS score plot, grouped by M1PROBI.
pls = PLSRegression(n_components=2)
pls.fit(X, metadata.M1PROBI == '1: Yes')
midx = metadata.M1PROBI == '0: No'
lidx = metadata.M1PROBI == '1: Yes'
plt.plot(pls.x_scores_[midx, 0], pls.x_scores_[midx, 1], 'ob', label='0: No')
plt.plot(pls.x_scores_[lidx, 0], pls.x_scores_[lidx, 1], 'og', label='1: Yes')
plt.xlabel('PLS1')
plt.ylabel('PLS2')
plt.legend()
Out[13]:
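The five score plots above repeat one fit-and-plot pattern. A hypothetical helper could replace them (plot_pls_scores is not part of the original notebook; sketch only, under the same assumptions as the cells above).
In [ ]:
# Hypothetical helper consolidating the repeated PLS score plots above.
from sklearn.cross_decomposition import PLSRegression

def plot_pls_scores(X, labels, positive, negative):
    pls = PLSRegression(n_components=2)
    pls.fit(X, (labels == positive).astype(float))
    scores = pls.transform(X)                       # sample scores on PLS1/PLS2
    for value, fmt in [(positive, 'ob'), (negative, 'og')]:
        idx = (labels == value).values
        plt.plot(scores[idx, 0], scores[idx, 1], fmt, label=value)
    plt.xlabel('PLS1')
    plt.ylabel('PLS2')
    plt.legend()

# Example: reproduces the vitamin D status plot above.
plot_pls_scores(table.values.astype(float), metadata.VDstatus, 'sufficiency', 'deficiency')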
In [ ]: